Loading Data
The following libraries will be used.
library(data.table) # database-like tables
library(ggplot2) # general plots
library(highcharter) # interactive plots
library(DT) # display tables
library(corrplot) # corr SPLOMs
library(vcd) # stats for categories
library(mice) # for imputation
Input is loaded and covnerted to data.table.
test <- read.csv("../input/test.csv")
train <- read.csv("../input/train.csv")
test <- data.table(test)
train <- data.table(train)
The number of dimensions and sampling density is calculated. The datasets are combined.
(N <- nrow(train))
## [1] 1460
(p <- ncol(train) - 2)
## [1] 79
N^(1/p)
## [1] 1.096617
test$SalePrice <- NaN
full <- rbind(train, test)
From the data_description variable types can be defined. There are 4 categories: nominal, ordinal, discrete, continuous. A lot of the discrete variables could arguably be handled as ordinals.
nomVars <- c("MSZoning", "Street", "Alley", "LandContour", "LotConfig",
"Neighborhood", "Condition1", "Condition2", "HouseStyle",
"RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
"MasVnrType", "Foundation", "Heating", "CentralAir", "Electrical",
"GarageType", "PavedDrive", "MiscFeature", "SaleType",
"SaleCondition")
ordVars <- c("MSSubClass", "LotShape", "Utilities", "LandSlope", "BldgType",
"OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual",
"BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
"HeatingQC", "KitchenQual", "Functional", "FireplaceQu",
"GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence",
"MoSold")
discVars <- c("YearBuilt", "YearRemodAdd", "BsmtFullBath", "BsmtHalfBath",
"FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
"TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars",
"YrSold")
contVars <- c("LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
"TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF",
"GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF",
"EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea",
"MiscVal", "LotFrontage")
A table is created to store feature attributes.
featAttr <- data.table(
name = colnames(test)[-1],
type = "nominal"
)
featAttr[name %in% ordVars, type := "ordinal"]
featAttr[name %in% discVars, type := "discrete"]
featAttr[name %in% contVars, type := "continuous"]
Special Values
According to data_description several \(NA\) have a special meaning. These values are replaced with more descriptive chr strings. Before, test and train set are combined.
full$label <- rep(c("train", "test"), c(nrow(test), nrow(train)))
full[is.na(Alley), Alley := "noAccess"]
full[is.na(BsmtQual), BsmtQual := "noBasement"]
full[is.na(BsmtCond), BsmtCond := "noBasement"]
full[is.na(BsmtExposure), BsmtExposure := "noBasement"]
full[is.na(BsmtFinType1), BsmtFinType1 := "noBasement"]
full[is.na(BsmtFinType2), BsmtFinType2 := "noBasement"]
full[is.na(FireplaceQu), FireplaceQu := "noFireplace"]
full[is.na(GarageType), GarageType := "noGarage"]
full[is.na(GarageFinish), GarageFinish := "noGarage"]
full[is.na(GarageQual), GarageQual := "noGarage"]
full[is.na(GarageCond), GarageCond := "noGarage"]
full[is.na(PoolQC) , PoolQC := "noPool"]
full[is.na(Fence), Fence := "noFence"]
full[is.na(MiscFeature), MiscFeature := "none"]
Nominal, ordinal, and discrete variables are converted to factor, continuous to numeric.
for (j in c(nomVars, ordVars, discVars)) full[[j]] <- as.factor(full[[j]])
for (j in contVars) full[[j]] <- as.numeric(full[[j]])
Ordering
From data_description the order of ordinal variable classes can be derived. These orderings are defined, then the levels of their factors are adjusted.
lvlOrd <- list(
LotShape = c("Reg", "IR1", "IR2", "IR3"),
Utilities = c("AllPub", "NoSewr", "NoSeWa", "ELO"),
LandSlope = c("Gtl", "Mod", "Sev"),
BldgType = c("1Fam", "2FmCon", "Duplx", "TwnhsE", "TwnhsI"),
OverallQual = 10:1,
OverallCond = 10:1,
ExterQual = c("Ex", "Gd", "TA", "Fa", "Po"),
ExterCond = c("Ex", "Gd", "TA", "Fa", "Po"),
BsmtQual = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
BsmtCond = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
BsmtExposure = c("Gd", "Av", "Mn", "No", "noBasement"),
BsmtFinType1 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
BsmtFinType2 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
HeatingQC = c("Ex", "Gd", "TA", "Fa", "Po"),
KitchenQual = c("Ex", "Gd", "TA", "Fa", "Po"),
Functional = c("Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"),
FireplaceQu = c("Ex", "Gd", "TA", "Fa", "Po", "noFireplace"),
GarageFinish = c("Fin", "RFn", "Unf", "noGarage"),
GarageQual = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
GarageCond = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
PoolQC = c("Ex", "Gd", "TA", "Fa", "noPool"),
Fence = c("GdPrv", "MnPrv", "GdWo", "MnWw", "noFence"),
MoSold = 1:12
)
for (j in names(lvlOrd)) {
full[[j]] <- factor(full[[j]], levels = lvlOrd[[j]])
}
Some features’ values heavily depend on the value of another feature. For example the values of \(BsmtUnfSF\), \(TotalBsmtSF\) only make sense if features \(BsmtCond\), \(BsmtExposure\), \(BsmtQual\) are not \(noBasement\). This information is given in data_description.
\[ \begin{aligned} BsmtUnfSF, TotalBsmtSF : BsmtCond, BsmtExposure, BsmtQual \ne noBasement\\ BsmtFinSF1 : BsmtFinType1 \ne noBasement\\ BsmtFinSF2 : BsmtFinType2 \ne noBasement \\ GarageYrBlt, GarageCars, GarageArea: GarageType, GarageFinish, GarageQual, GarageCond \ne noGarage \\ PoolArea : PoolQC \ne noPool \\ MiscVal : MiscFeature \ne none \\ MasVnrArea : MasVnrType \ne none \end{aligned} \] For the garage and basement features there are several features which indicate the absence of that feature. If one of them indicates the absence of a feature, the others should too. This can be tested with the difference between two sets.
outersect <- function(x, y) sort(c(setdiff(x, y), setdiff(y, x)))
The differences between Ids of observations with noBasement are computed.
outersect(which(full$BsmtCond == "noBasement"),
which(full$BsmtExposure == "noBasement"))
## [1] 949 1488 2041 2186 2349 2525
outersect(which(full$BsmtCond == "noBasement"),
which(full$BsmtQual == "noBasement"))
## [1] 2041 2186 2218 2219 2525
full[c(949, 1488, 2041, 2186, 2218, 2219, 2349, 2525),
.(Id, BsmtCond, BsmtExposure, BsmtQual, BsmtUnfSF, TotalBsmtSF)]
## Id BsmtCond BsmtExposure BsmtQual BsmtUnfSF TotalBsmtSF
## 1: 949 TA noBasement Gd 936 936
## 2: 1488 TA noBasement Gd 1595 1595
## 3: 2041 noBasement Mn Gd 0 1426
## 4: 2186 noBasement No TA 94 1127
## 5: 2218 Fa No noBasement 173 173
## 6: 2219 TA No noBasement 356 356
## 7: 2349 TA noBasement Gd 725 725
## 8: 2525 noBasement Av TA 240 995
So here the data_description is not consistent. The general condition is set to noBasement but there are values for basement exposure, quality, and so on. noBasement was translated from NAs. Since for all the other basement related variables there are reasonable values it might be that in these cases NA did not mean noBasement.
The same can be done for the garage related variables.
outersect(which(full$GarageType == "noGarage"),
which(full$GarageFinish == "noGarage"))
## [1] 2127 2577
outersect(which(full$GarageType == "noGarage"),
which(full$GarageQual == "noGarage"))
## [1] 2127 2577
outersect(which(full$GarageType == "noGarage"),
which(full$GarageCond == "noGarage"))
## [1] 2127 2577
full[c(2127, 2577),
.(GarageYrBlt, GarageCars, GarageArea, GarageType,
GarageFinish, GarageQual, GarageCond)]
## GarageYrBlt GarageCars GarageArea GarageType GarageFinish GarageQual
## 1: NA 1 360 Detchd noGarage noGarage
## 2: NA NA NA Detchd noGarage noGarage
## GarageCond
## 1: noGarage
## 2: noGarage
Here, most garage related features were missing.
Mark Undefined Variables
If e.g. PoolQC is noPool, then there should be a missing vlaue or a zero for PoolArea. If there actually is a value, this could either mean it is a mistake or it means something special that is not clearly described. To at least be consistent, all NAs will be set to 0.
full[BsmtCond == "noBasement" & is.na(BsmtUnfSF), BsmtUnfSF := 0]
full[BsmtCond == "noBasement" & is.na(TotalBsmtSF), TotalBsmtSF := 0]
full[BsmtFinType1 == "noBasement" & is.na(BsmtFinSF1), BsmtFinSF1 := 0]
full[BsmtFinType2 == "noBasement" & is.na(BsmtFinSF2), BsmtFinSF2 := 0]
full[GarageType == "noGarage" & is.na(GarageYrBlt), GarageYrBlt := "0"]
full[GarageType == "noGarage" & is.na(GarageCars), GarageCars := "0"]
full[GarageType == "noGarage" & is.na(GarageArea), GarageArea := 0]
full[PoolQC == "noPool" & is.na(PoolArea), PoolArea := 0]
full[MiscFeature == "noFeature" & is.na(MiscVal), MiscVal := 0]
\(NA\) which are left must be true missing data.
naFeats <- apply(full[, !"SalePrice"], 2, function(x) any(is.na(x)))
naObs <- apply(full[, !"SalePrice"], 1, function(x) any(is.na(x)))
Features
The total number of missing values per feature will be saved. The features with at least one missing value are shown below.
total <- apply(full[, naFeats, with = FALSE], 2, function(j) sum(is.na(j)))
df <- data.frame(
missing = total / nrow(full),
name = factor(names(total), levels = names(total)[order(total)]),
type = featAttr[match(names(total), featAttr$name), type]
)
ggplot(df, aes(x = name, y = missing, fill = type)) +
geom_bar(stat = "identity") +
scale_y_continuous("proportion missing") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Features with Missing Values")
Observations
Below the number of missing values for observations that have missing values is shown. Many observations have 1 or 2, some have 3 or 4 missing values.
total <- apply(full[naObs, ], 1, function(i) sum(is.na(i)))
df <- data.frame(
y = total,
name = full[naObs, Id]
)
highchart() %>%
hc_chart(type = "column") %>%
hc_title(text = "Observations with Missing Values") %>%
hc_xAxis(title = list(text = "observations"), categories = df$name,
labels = list(rotation = "-45")) %>%
hc_yAxis(title = list(text = "missing"), align = "left") %>%
hc_add_series(showInLegend = FALSE, data = list_parse(df[order(df$y), ]))
Feature distributions are described for each type of variable separately.
Values were log10 transformed to better capture their spread. There are many zeros, partly because of the dependency to another variable. These zeros are excluded during the transformation.
dt <- melt(full, "Id", contVars)
ggplot(dt, aes(x = value)) +
geom_histogram(bins = 30) +
scale_x_continuous(trans = "log10") +
facet_wrap(~ variable, scale = "free") +
theme_bw()
Summaries for each distribution are computed. There are many undefined variables which are set to zero. This influences the description a lot. Therefore, they are excluded.
dt <- full[, contVars, with = FALSE]
df <- data.frame(
Min = apply(dt, 2, function(x) min(x[x > 0], na.rm = TRUE)),
Mean = apply(dt, 2, function(x) mean(x[x > 0], na.rm = TRUE)),
Median = apply(dt, 2, function(x) median(x[x > 0], na.rm = TRUE)),
Max = apply(dt, 2, function(x) max(x[x > 0], na.rm = TRUE))
)
Relative difference between mean and median (relDelta) are computed as an indicator for skewness. Also, relative cardinality and relative number of outliers are computed. Outliers are defined as being differing at least \(+/- 1.5 MAD\) from the median.
df$relDelta <- abs(df$Mean - df$Median) / df$Mean
df$relCard <- apply(dt, 2, function(x) {
length(unique(x[x > 0])) / length(x[x > 0])
})
df$relOut <- apply(dt, 2, function(x) {
length(boxplot.stats(x[x > 0])$out) / length(x[x > 0])
})
PoolArea and MiscVal show a high proportion of outliers, but this might be because there are not many total values for these variables. Many supposably continuous variables have a quite low cardinality.
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>%
formatStyle('relDelta',
color = styleInterval(c(0.2, 0.8), c('green', 'orange', 'red'))) %>%
formatStyle('relCard',
color = styleInterval(c(0.2, 0.6), c('red', 'orange', 'green'))) %>%
formatStyle('relOut',
color = styleInterval(c(0.05, 0.1), c('green', 'orange', 'red'))) %>%
formatRound(c(2, 5:7), 3)
Distributions are shown below.
dt <- melt(full, "Id", discVars)
ggplot(dt, aes(x = value)) +
geom_bar() +
facet_wrap(~ variable, scales = "free") +
theme_bw()
As summary, the value range, and cardinality are computed.
dt <- full[, discVars, with = FALSE]
df <- data.frame(
Card = apply(dt, 2, function(x) length(unique(x[!is.na(x)]))),
Min = apply(dt, 2, function(x) min(as.numeric(as.character(x)), na.rm = TRUE)),
Max = apply(dt, 2, function(x) max(as.numeric(as.character(x)), na.rm = TRUE))
)
Apart from that, the modes are usually interesting. For this a function has to be defined first.
Mode <- function(x, fst = TRUE) {
ux <- unique(x)
tab <- tabulate(match(x, ux))
modeIdx <- which.max(tab)
if (fst) return(list(value = ux[modeIdx], freq = tab[modeIdx]))
tab[modeIdx] <- 0
modeIdx <- which.max(tab)
return(list(value = ux[modeIdx], freq = tab[modeIdx]))
}
First and second modes are computed and the table is shown. Modes that make up more than 50% of observations will be highlighted.
df$Mode <- apply(dt, 2, function(x) Mode(x)$value)
df$ModeCount <- apply(dt, 2, function(x) Mode(x)$freq)
df$Mode2nd <- apply(dt, 2, function(x) Mode(x, fst = FALSE)$value)
df$ModeCount2nd <- apply(dt, 2, function(x) Mode(x, fst = FALSE)$freq)
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>%
formatStyle('ModeCount',
color = styleInterval(nrow(dt) / 2, c("black", 'orange')))
In GarageYrBlt the maximum value is 2207, which does not make sense. It seems that this is a typo and the real value is 2007. After checking that there are not more values above 2010 2207 is changed to 2007.
full[GarageYrBlt == 2207, GarageYrBlt := "2007"]
Ordinal Variables
dt <- melt(full, "Id", ordVars)
ggplot(dt, aes(x = value)) +
geom_bar() +
facet_wrap(~ variable, scales = "free") +
theme_bw()
Nominal Variables
dt <- melt(full, "Id", nomVars)
ggplot(dt, aes(x = value)) +
geom_bar() +
facet_wrap(~ variable, scales = "free") +
theme_bw()
Summary
Cardinality, 1st and 2nd modes are given as summary. Modes that make up more than 50% of observations will be highlighted.
dt <- full[, c(ordVars, nomVars), with = FALSE]
df <- data.frame(
Card = apply(dt, 2, function(x) length(unique(x[!is.na(x)]))),
Mode = apply(dt, 2, function(x) Mode(x)$value),
ModeCount = apply(dt, 2, function(x) Mode(x)$freq),
Mode2nd = apply(dt, 2, function(x) Mode(x, fst = FALSE)$value),
ModeCount2nd = apply(dt, 2, function(x) Mode(x, fst = FALSE)$freq),
Type = rep(c("ord", "nom"), c(length(ordVars), length(nomVars)))
)
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>%
formatStyle('Type',
color = styleEqual(unique(df$Type), c('orange', 'blue'))) %>%
formatStyle('ModeCount',
color = styleInterval(nrow(dt) / 2, c("black", 'orange')))
vars <- c(contVars, discVars)
The Pearson Correlation Coefficient is calculated for all feature pairs where both features are continuous. The results are shown as heatmap below. Using euclidean distance of the correlation matrix as a metric rows and columns were hierarchically ordered (complete link).
m <- cor(
full[, contVars, with = FALSE],
use = "complete.obs"
)
corrplot(m, method = "color", order = "hclust")
Some clusters are visible. The same hierarchical clustering is shown as tree below.
corrClust <- hclust(dist(m))
plot(corrClust)
Here, discrete variables are included.
vars <- c(discVars, ordVars, nomVars)
For categorical features Cramer’s V is calculated for all feature pairs to indicate correlation. The results are shown as heatmap below.
m <- as.matrix(full[, vars, with = FALSE])
m <- m[complete.cases(m), ]
mCramer <- matrix(NA, ncol(m), ncol(m))
for (i in seq_len(ncol(m))) {
mCramer[i, ] <- vapply(
seq_len(ncol(m)),
function(j) assocstats(table(m[, i], m[, j]))$cramer,
numeric(1)
)
}
Using euclidean distance of the correlation matrix as a metric rows and columns were hierarchically ordered (complete link).
colnames(mCramer) <- colnames(m)
rownames(mCramer) <- colnames(m)
corrplot(mCramer, method = "color", order = "hclust")
The tree for the clustering:
corrClust <- hclust(dist(mCramer))
plot(corrClust)
Single missing values in basement related features, where all other basement related features did have a value, are replaced by their mode.
full[c(949, 1488, 2349), BsmtExposure := "No"]
full[c(2041, 2186, 2525), BsmtCond := "TA"]
full[c(2218, 2219), BsmtQual := "TA"]
For garage related features there are two cases with almost no non-missing value for garage. For these two cases the garage features are set to noGarage.
full[c(2127, 2577), GarageYrBlt := "0"]
full[c(2127, 2577), GarageCars := "0"]
full[c(2127, 2577), GarageArea := 0]
full[c(2127, 2577), GarageType := "noGarage"]
Features with only very few missing values will be imputed using the mode for categories and the median for continuous variables.
full[is.na(Exterior1st), Exterior1st := "VinylSd"]
full[is.na(Exterior2nd), Exterior2nd := "VinylSd"]
full[is.na(Electrical), Electrical := "SBrkr"]
full[is.na(KitchenQual), KitchenQual := "TA"]
full[is.na(SaleType), SaleType := "WD"]
full[is.na(Utilities), Utilities := "AllPub"]
full[is.na(Functional), Functional := "Typ"]
full[is.na(MSZoning), MSZoning := "RL"]
full[is.na(MasVnrArea), MasVnrArea := 202]
full[is.na(MasVnrType), MasVnrType := "none"]
There were two observations with NA values for basement bath related variables in which all other basement related variables were set to noBasement. These will be set to zreo.
full[is.na(BsmtFullBath), BsmtFullBath := "0"]
full[is.na(BsmtHalfBath), BsmtHalfBath := "0"]
BldgType and LotFrontage have around 10-15% missing values. These will be imputed by Gibbs sampling of the posterior distribution. Variables with missing values are repeatedly predicted given a set of predictor variables. For LotFrontage predictive mean matching, and for BldgType polytomous regression will be used. As predictors all continuous variables and the the 3 categorical variables which are most correlated to BldgType are included. Several datasets are created by this method but only one will be used.
vars <- c(contVars, "BldgType", "MSSubClass", "BedroomAbvGr", "TotRmsAbvGrd")
dt <- full[, vars, with = FALSE]
imp <- mice(dt, seed = 42)
LotFrontage
The distribution of LotFrontage after imputation is compared to the original distribution.
df <- data.frame(
LotFrontage = c(dt$LotFrontage, complete(imp)$LotFrontage),
Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = LotFrontage, color = Distro)) +
geom_density() +
theme_bw()
Both distributions look very similar. Furthermore, its correlation with the 4 most correlated continuous variables is compared.
df <- data.frame(
Value = c(dt$LotArea, dt$MasVnrArea, dt$GrLivArea, dt$GarageArea),
Variable = rep(c("LotArea", "MasVnrArea", "GrLivArea", "GarageArea"),
each = nrow(dt)),
LotFrontage = rep(complete(imp)$LotFrontage, each = 4),
isImputed = factor(is.na(dt$LotFrontage), levels = c(TRUE, FALSE))
)
ggplot(df, aes(x = LotFrontage, y = Value, color = isImputed)) +
geom_point(size = .7) +
facet_wrap( ~ Variable, scales = "free") +
theme_bw()
Imputed values seem to resemble the shape of the original data.
BldgType
Imputed and original distributions are compared.
df <- data.frame(
BldgType = c(as.character(dt$BldgType), as.character(complete(imp)$BldgType)),
Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = BldgType, fill = Distro)) +
geom_bar(position = "dodge") +
scale_x_discrete() +
theme_bw()
Next correlation to the closest correlated categorical feature MSSubClass before and after imputation is checked.
df <- data.frame(
BldgType = c(as.character(dt$BldgType), as.character(complete(imp)$BldgType)),
MSSubClass = rep(dt$MSSubClass, 2),
Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = MSSubClass, fill = BldgType)) +
geom_bar(position = "dodge") +
facet_grid(Distro ~ .) +
theme_bw()
Both
Finally, the correlation of both imputed variables is checked.
df <- data.frame(
LotFrontage = rep(c(dt$LotFrontage, complete(imp)$LotFrontage), each = 2),
BldgType = rep(c(as.character(dt$BldgType),
as.character(complete(imp)$BldgType)), each = 2),
Distro = rep(c("Original", "Imputed"), each = 2*nrow(dt))
)
ggplot(df, aes(x = LotFrontage, color = BldgType)) +
geom_density() +
facet_grid(Distro ~ .) +
theme_bw()
## Warning: Removed 972 rows containing non-finite values (stat_density).
The distribution of \(BldgType = TwnhsE\) changed visibly. However, with less than 10% this value is quite underrepresented so its distribution estimate is less stable. All in all the imputed values seem to be useful